library(here)
## here() starts at /Users/tania/Documents/Git_Repos/data-viz-projects
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ───────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.1
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## ── Conflicts ──────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)

1 Initial play with the data

Loading the data

# Load the state-level housing data. here() builds the path from the project
# root reported when the package was attached.
housing <- read.csv(file = here('data', 'landdata-states.csv'))
# Preview the first six rows (the default for head()).
head(x = housing, n = 6)
##   State region    Date Home.Value Structure.Cost Land.Value
## 1    AK   West 2010.25     224952         160599      64352
## 2    AK   West 2010.50     225511         160252      65259
## 3    AK   West 2009.75     225820         163791      62029
## 4    AK   West 2010.00     224994         161787      63207
## 5    AK   West 2008.00     234590         155400      79190
## 6    AK   West 2008.25     233714         157458      76256
##   Land.Share..Pct. Home.Price.Index Land.Price.Index Year Qrtr
## 1             28.6            1.481            1.552 2010    1
## 2             28.9            1.484            1.576 2010    2
## 3             27.5            1.486            1.494 2009    3
## 4             28.1            1.481            1.524 2009    4
## 5             33.8            1.544            1.885 2007    4
## 6             32.6            1.538            1.817 2008    1

Creating a histogram

# Base-graphics histogram of home values. The call is kept verbatim because
# hist() derives its default title and x label from the argument expression.
hist(housing$Home.Value)

# ggplot2 version: map Home.Value to x and let geom_histogram() bin it with
# its default settings.
ggplot(data = housing, mapping = aes(x = Home.Value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Base colored scatter plot

# Base-graphics scatter plot of home value over time for two states.
# Massachusetts is drawn first with the default (black) points ...
plot(Home.Value ~ Date, data = subset(housing, State == "MA"))
# ... then Texas is overlaid on the same axes in purple.
points(Home.Value ~ Date, data = subset(housing, State == "TX"), col = "purple")
# The legend has to be assembled by hand: position, labels, colours and
# plotting symbols must be kept in sync with the two calls above.
legend(
  x = 1975, y = 400000,
  legend = c("MA", "TX"),
  title = "State",
  col = c("black", "purple"),
  pch = c(1, 1)
)

Using ggplot

# Same comparison with ggplot2: mapping State to colour produces the
# per-state colours and the legend automatically.
ggplot(
  data = subset(housing, State %in% c("MA", "TX")),
  mapping = aes(x = Date, y = Home.Value, colour = State)
) +
  geom_point()

1.1 Scatter plots

# Keep only the rows whose Date is 2001.25 (the quarter this tutorial labels
# 2001 Q1). The fractional part .25 is exactly representable as a double, so
# the equality test is reliable here.
hp2001Q1 <- subset(housing, subset = Date == 2001.25)
# Structure cost against log land value, one point per retained row.
ggplot(
  data = hp2001Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
) +
  geom_point()

Adding prediction line

# Regress structure cost on log land value and store the fitted values as a
# new column so they can be drawn as a line.
hp2001Q1$pred.SC <- predict(
  object = lm(formula = Structure.Cost ~ log(Land.Value), data = hp2001Q1)
)

# Base plot object shared by the layers below.
plt <- ggplot(
  data = hp2001Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
)

# Points coloured by home value, plus the fitted line (y overridden to the
# predictions for this layer only).
plt +
  geom_point(mapping = aes(colour = Home.Value)) +
  geom_line(mapping = aes(y = pred.SC))

Adding labels to the points

# Rebuild the base plot: structure cost against log land value.
plt <- ggplot(
  data = hp2001Q1,
  mapping = aes(x = log(Land.Value), y = Structure.Cost)
)

# Draw each observation as its state abbreviation instead of a point.
# size is a fixed setting, so it sits outside aes().
plt +
  geom_text(mapping = aes(label = State), size = 3)

1.2 Aesthetic Mapping VS Assignment

Note that variables are mapped to aesthetics with the aes() function, while fixed aesthetics are set outside the aes() call.

1.3 Mapping variables to other aesthetics

# Map two more variables onto the points: home value to colour and region
# to point shape.
plt +
  geom_point(mapping = aes(colour = Home.Value, shape = region))
## Warning: Removed 1 rows containing missing values (geom_point).

1.3.1 Statistical transformations

# Pre-summarise the data: mean home value per state (one row per state).
housing.sum <- aggregate(housing["Home.Value"], housing["State"], FUN = mean)

# Plot the summarised table, not the raw data. With stat = "identity" the bar
# height is taken directly from y, so passing the raw `housing` data would
# stack every row of a state into one bar (a sum over all dates) instead of
# showing the state mean computed above.
ggplot(housing.sum, aes(x = State, y = Home.Value)) +
  geom_bar(stat = "identity")

2 Scales

2.1 Scales: Controlling aesthetics mapping

Aesthetic mapping (i.e., with aes()) only says that a variable should be mapped to an aesthetic. It doesn’t say how that should happen. For example, when mapping a variable to shape with aes(shape = x) you don’t say what shapes should be used. Similarly, aes(color = z) doesn’t say what colors should be used. Describing what colors/shapes/sizes etc. to use is done by modifying the corresponding scale. In ggplot2 scales include

  • position
  • color and fill
  • size
  • shape
  • line type

Scales are modified with a series of functions using a scale_<aesthetic>_<type> naming scheme. Try typing scale_<tab> to see a list of scale modification functions.

3 Common Scale Arguments

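The following arguments are common to most scales in ggplot2:

  • name: the first argument gives the axis or legend title
  • limits: the minimum and maximum of the scale
  • breaks: the points along the scale where labels should appear
  • labels: the labels that appear at each break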

Specific scale functions may have additional arguments; for example, the scale_color_continuous function has arguments low and high for setting the colors at the low and high end of the scale.

3.1 Scale Modification Examples

Start by constructing a dotplot showing the distribution of the home price index by Date and State.

# Base plot: home price index by state, with the legend moved to the top and
# small axis text so the state abbreviations fit.
p1 <- ggplot(data = housing, mapping = aes(x = State, y = Home.Price.Index)) +
  theme(
    legend.position = "top",
    axis.text = element_text(size = 6)
  )

# Add semi-transparent points coloured by date. Jitter is horizontal only
# (height = 0), so the plotted index values themselves are not perturbed.
p2 <- p1 +
  geom_point(
    mapping = aes(colour = Date),
    alpha = 0.6,
    size = 1.5,
    position = position_jitter(width = 0.25, height = 0)
  )
# Display the plot.
p2

Change the x axis title and the breaks and labels of the color scale

# Retitle the x axis, and give the colour scale an empty title with three
# hand-picked breaks shown as two-digit years.
p2 +
  scale_x_discrete(name = "State Abbreviation") +
  scale_colour_continuous(
    name = "",
    breaks = c(1976, 1994, 2013),
    labels = c("'76", "'94", "'13")
  )

Changing the colour schemes

# Same scale tweaks as before, but also choose the colours at the two ends of
# the continuous colour scale.
p2 +
  scale_x_discrete(name = "State Abbreviation") +
  scale_colour_continuous(
    name = "",
    breaks = c(1976, 1994, 2013),
    labels = c("'76", "'94", "'13"),
    low = "blue",
    high = "purple"
  )

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Use muted() from the scales package to get softer end-point colours for the
# continuous colour scale.
p2 +
  scale_colour_continuous(
    name = "",
    breaks = c(1976, 1994, 2013),
    labels = c("'76", "'94", "'13"),
    low = muted("blue"),
    high = muted("red")
  )

3.2 Using different color scales

scale_color_gradient2 will interpolate between three different colors

# Diverging colour scale: muted blue at the low end, gray at the midpoint
# (1994) and muted red at the high end.
p2 +
  scale_colour_gradient2(
    name = "",
    breaks = c(1976, 1994, 2013),
    labels = c("'76", "'94", "'13"),
    low = muted("blue"),
    mid = "gray60",
    high = muted("red"),
    midpoint = 1994
  )

4 Faceting

# Base plot for the faceting examples: home value over time.
p5 <- ggplot(data = housing, mapping = aes(x = Date, y = Home.Value))

# One line per state, all in a single panel and told apart only by colour.
p5 +
  geom_line(mapping = aes(colour = State))

4.1 Faceting

# Give every state its own panel (10 panels per row). Each panel already
# identifies its state, so the colour legend is redundant and is removed.
# The documented value for hiding the legend is lowercase "none"; the
# capitalised "None" is not a documented value and should not be relied on.
# The outer parentheses print the plot while also storing it back in p5.
(p5 <- p5 +
  geom_line(aes(color = State)) +
  facet_wrap(~State, ncol = 10) +
  theme(legend.position = "none"))

4.2 Themes

# Apply the built-in dark theme first, then override the text colour on top
# of it.
p5 +
  theme_dark() +
  theme(text = element_text(colour = "turquoise"))